C9orf72¶

In [162]:
rm(list=ls())

LIBRAIRIES AND FUNCTIONS¶

In [163]:
library(tidyverse)
library(readxl)
library(MetBrewer)
derain <- met.brewer("Derain")
demuth <- met.brewer("Demuth")
cassatt <- met.brewer("Cassatt2")
library(patchwork)
library(ggsci)
library(ggpubr)
source("../src/ggstyles.R")

DATA¶

In [164]:
# Repeats
dd <- read_excel("../data/ICM_C9_expanded_rawRepeats_withClinical_2025-08-29.xlsx")
# MetaData
dc <- read_excel("../data/ICM_C9_expanded_summary_stats_withClinical_2025-08-29.xlsx")
In [165]:
dim(dd)
dim(dc)
  1. 3348
  2. 6
  1. 47
  2. 23

Basics¶

In [166]:
head(dc,2)
A tibble: 2 × 23
SampleMolecules_countMeanMedianSDModeP90MinMaxProp_gt500⋯Prop_gt2500Prop_gt3000Prop_gt3500Prop_gt4000Prop_gt4500Prop_gt5000ConditionAPAOGender
<chr><dbl><chr><chr><chr><chr><chr><chr><chr><chr>⋯<chr><chr><chr><chr><chr><chr><chr><dbl><chr><chr>
125425642196.412500000001 1943.45833333333581229.95511349321712669.733333333332 4083.7833333333338299.5666666666657 5050.4833333333370.984375⋯0.3281250.21875 0.2031250.1093750.0625 0.015625Presymptomatic64NAM
125718641952.40755208333351697.2833333333328976.0048584062154 1278.03333333333353241.779999999998 231.833333333333375426.7999999999990.96875 ⋯0.25 0.1718750.0781250.03125 0.0156250.015625Presymptomatic41NAM
In [167]:
dim(dc)
colnames(dc)
length(unique(dc$Sample))
  1. 47
  2. 23
  1. 'Sample'
  2. 'Molecules_count'
  3. 'Mean'
  4. 'Median'
  5. 'SD'
  6. 'Mode'
  7. 'P90'
  8. 'Min'
  9. 'Max'
  10. 'Prop_gt500'
  11. 'Prop_gt1000'
  12. 'Prop_gt1500'
  13. 'Prop_gt2000'
  14. 'Prop_gt2500'
  15. 'Prop_gt3000'
  16. 'Prop_gt3500'
  17. 'Prop_gt4000'
  18. 'Prop_gt4500'
  19. 'Prop_gt5000'
  20. 'Condition'
  21. 'AP'
  22. 'AO'
  23. 'Gender'
47
In [168]:
table(dc$Condition, exclude=F)
           ALS            FTD Presymptomatic 
            17             23              7 
In [169]:
table(dc$Gender, exclude=F)
 F  M 
25 22 
In [170]:
table(dc$Condition, dc$Gender, exclude=F)
                
                  F  M
  ALS             9  8
  FTD            11 12
  Presymptomatic  5  2
In [171]:
summary(dc$AP)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  31.00   54.50   64.00   61.49   71.00   89.00 
In [172]:
dc$AO <- as.numeric(dc$AO)
Warning message:
“NAs introduits lors de la conversion automatique”
In [173]:
summary(dc$AO)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  35.00   54.00   61.00   59.78   67.00   85.00       6 
In [174]:
head(dd,2)
A tibble: 2 × 6
SamplerepeatsConditionAPAOGender
<chr><chr><chr><dbl><chr><chr>
1254252669.733333333332Presymptomatic64NAM
125425299.5666666666657Presymptomatic64NAM
In [175]:
length(unique(dd$Sample))
47
In [176]:
dd$repeats <- as.numeric(dd$repeats)
In [177]:
summary(dd$repeats)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  70.37  994.52 1532.06 1802.39 2364.17 7611.62 
In [178]:
# Number of MUT repeats per sample
dd %>%
  #count(Sample, name = "n_repeats") %>%  # count how many measurements per sample
  group_by(Sample) %>%
  summarise(n_repeats = n()) %>%
  ungroup() %>%
  ggplot(aes(x = n_repeats)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of Number of Measurements per Sample",
    x = "Number of repeats per Sample",
    y = "Count of Samples"
  ) +
  theme_minimal() + gtheme(20)
No description has been provided for this image

Explore Length¶

Global¶

In [179]:
options(repr.plot.width=10, repr.plot.height=8)

ggplot(dd, aes(repeats)) + 
geom_density(size=1.9, fill="grey95", color="grey15")  + 
theme_classic() + gtheme(20) + 
  labs(
    title = "Distribution of Repeat Measurements",
    x = "Repeat Value",
    y = "Density"
  )
#geom_vline(xintercept = c(147, 157, 165), linetype='dashed') + 
#scale_x_continuous(limits=c(90,300), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1))

ggplot(dd, aes(x = repeats)) +
  geom_histogram(binwidth = 100, fill = "grey85", color = "grey15") +
  labs(
    title = "Distribution of Repeat Measurements",
    x = "Repeat Value",
    y = "Count"
  ) +
  theme_minimal(base_size = 14) + gtheme(20)
No description has been provided for this image
No description has been provided for this image
In [180]:
summary(dd$repeats)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  70.37  994.52 1532.06 1802.39 2364.17 7611.62 

Repeats x Sex¶

In [181]:
options(repr.plot.width=10, repr.plot.height=8)

ggplot(dd, aes(repeats, fill=Gender)) + 
geom_density(size=1.9, color="grey15")  + 
theme_classic() + gtheme(20) + 
  labs(
    title = "Distribution of Repeat Measurements",
    x = "Repeat Value",
    y = "Density"
  ) + topleg + scale_fill_nejm(alpha=.4)
No description has been provided for this image
In [182]:
options(repr.plot.width=5, repr.plot.height=8)

ggplot(dd, aes(y=repeats, x=Gender, fill=Gender)) + 
geom_boxplot(color="grey15")  + 
theme_classic() + gtheme(20) + 
  labs(
    #title = "Distribution of Repeat Measurements",
    y = "Repeat Value",
    x = "Sex"
  ) + noleg + stat_compare_means() + scale_fill_nejm(alpha=.4)
No description has been provided for this image
In [183]:
options(repr.plot.width=5, repr.plot.height=8)

ggplot(dd, aes(y=repeats, x=Gender)) + 
geom_violin(alpha=.3, aes(fill=Gender)) + 
geom_boxplot(outlier.shape = NA, width=.1, size=1, fill="white") +
#geom_jitter(size=4, width = .1, alpha=.6) + 
theme_classic() + gtheme(20) + 
  labs(
    #title = "Distribution of Repeat Measurements",
    y = "Repeat Value",
    x = "Sex"
  ) + noleg + stat_compare_means(size=6) + scale_fill_nejm(alpha=.4)
No description has been provided for this image

Repeats x Age¶

In [184]:
head(dc,2)
A tibble: 2 × 23
SampleMolecules_countMeanMedianSDModeP90MinMaxProp_gt500⋯Prop_gt2500Prop_gt3000Prop_gt3500Prop_gt4000Prop_gt4500Prop_gt5000ConditionAPAOGender
<chr><dbl><chr><chr><chr><chr><chr><chr><chr><chr>⋯<chr><chr><chr><chr><chr><chr><chr><dbl><dbl><chr>
125425642196.412500000001 1943.45833333333581229.95511349321712669.733333333332 4083.7833333333338299.5666666666657 5050.4833333333370.984375⋯0.3281250.21875 0.2031250.1093750.0625 0.015625Presymptomatic64NAM
125718641952.40755208333351697.2833333333328976.0048584062154 1278.03333333333353241.779999999998 231.833333333333375426.7999999999990.96875 ⋯0.25 0.1718750.0781250.03125 0.0156250.015625Presymptomatic41NAM
In [185]:
summary(dc$AP)
dd$group_age <- "64-"
dd$group_age[which(dd$AP >= 64)] <- "64+"
dd$group_age <- factor(dd$group_age, levels=c("64-","64+"))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  31.00   54.50   64.00   61.49   71.00   89.00 
In [186]:
options(repr.plot.width=10, repr.plot.height=8)

ggplot(dd, aes(repeats, fill=group_age)) + 
geom_density(size=1.9, color="grey15")  + 
theme_classic() + gtheme(20) + 
  labs(
    title = "Distribution of Repeat Measurements",
    x = "Repeat Value",
    y = "Density"
  ) + topleg + scale_fill_jama(alpha=.4)
No description has been provided for this image
In [187]:
options(repr.plot.width=5, repr.plot.height=8)

ggplot(dd, aes(y=repeats, x=group_age)) + 
geom_violin(alpha=.3, aes(fill=group_age)) + 
geom_boxplot(outlier.shape = NA, width=.1, size=1, fill="white") +
#geom_jitter(size=4, width = .1, alpha=.6) + 
theme_classic() + gtheme(20) + 
  labs(
    #title = "Distribution of Repeat Measurements",
    y = "Repeat Value",
    x = "Age"
  ) + noleg + stat_compare_means(size=6) + scale_fill_jama(alpha=.4)
No description has been provided for this image
In [188]:
## Repeats x Disease Type
In [189]:
table(dc$Condition)
           ALS            FTD Presymptomatic 
            17             23              7 
In [190]:
options(repr.plot.width=10, repr.plot.height=8)

ggplot(dd, aes(repeats, fill=Condition)) + 
geom_density(size=1.9, color="grey15")  + 
theme_classic() + gtheme(20) + 
  labs(
    title = "Distribution of Repeat Measurements",
    x = "Repeat Value",
    y = "Density"
  ) + topleg + scale_fill_jco(alpha=.4)
No description has been provided for this image
In [191]:
options(repr.plot.width=10, repr.plot.height=8)

dd %>% filter(Condition %in% c("ALS","FTD")) %>% ggplot(aes(repeats, fill=Condition)) + 
geom_density(size=1.9, color="grey15")  + 
theme_classic() + gtheme(20) + 
  labs(
    title = "Distribution of Repeat Measurements",
    x = "Repeat Value",
    y = "Density"
  ) + topleg + scale_fill_jco(alpha=.4)
No description has been provided for this image
In [192]:
options(repr.plot.width=7, repr.plot.height=8)

ggplot(dd, aes(y=repeats, x=Condition)) + 
geom_violin(alpha=.3, aes(fill=Condition)) + 
geom_boxplot(outlier.shape = NA, width=.1, size=1, fill="white") +
#geom_jitter(size=4, width = .1, alpha=.6) + 
theme_classic() + gtheme(20) + 
  labs(
    #title = "Distribution of Repeat Measurements",
    y = "Repeat Value",
    x = "Condition"
  ) + noleg + stat_compare_means(size=6) + scale_fill_jco(alpha=.4)
No description has been provided for this image

Size Region Enrichment¶

In [194]:
# Calculate Counts and Props 
dregions <- dd %>% 
group_by(Condition, round(repeats), .drop = FALSE) %>% summarise(n=n(), .groups = "drop_last") %>%
mutate(prop=n / sum(n)) %>%
pivot_wider(names_from=Condition, values_from=c(n,prop)) %>%
replace(is.na(.), 0) %>% rename(Repeat = "round(repeats)")
dregions$diff_ALS_FTD <- dregions$prop_ALS -  dregions$prop_FTD

options(repr.plot.width=12,repr.plot.height=10)

ggplot(dregions, aes(x = Repeat, y=diff_ALS_FTD)) +
  geom_point(size=1.5) +
  geom_smooth(method="loess", span=0.3, color="black", level=0.95, fill="grey55") + 
  geom_hline(yintercept=0, linetype="dashed", size=0.5) + 
  theme_classic() + gtheme(40) + 
  xlab("Length") +
  ylab("Difference in normalized count")
`geom_smooth()` using formula = 'y ~ x'
No description has been provided for this image

With binning¶

In [235]:
bin_size <- 50

dd_binned <- dd %>%
  mutate(
    repeat_bin = floor(repeats / bin_size) * bin_size
  )

dregions <- dd_binned %>% 
group_by(Condition, repeat_bin, .drop = FALSE) %>% summarise(n=n(), .groups = "drop_last") %>%
mutate(prop=n / sum(n)) %>%
pivot_wider(names_from=Condition, values_from=c(n,prop)) %>%
replace(is.na(.), 0)
dregions$diff_ALS_FTD <- dregions$prop_ALS -  dregions$prop_FTD

options(repr.plot.width=12,repr.plot.height=10)

ggplot(dregions, aes(x = repeat_bin, y=diff_ALS_FTD)) +
  geom_point(size=1.5) +
  geom_smooth(method="loess", span=0.3, color="black", level=0.95, fill="grey55") + 
  geom_hline(yintercept=0, linetype="dashed", size=0.5) + 
  theme_classic() + gtheme(30) + 
  xlab("Length") +
  ylab("Difference in normalized count (ALS - FTD)") + 
  scale_x_continuous(limits=c(70,6000))
#geom_vline()
#, breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1))
`geom_smooth()` using formula = 'y ~ x'
Warning message:
“Removed 5 rows containing non-finite outside the scale range (`stat_smooth()`).”
Warning message:
“Removed 5 rows containing missing values or values outside the scale range (`geom_point()`).”
No description has been provided for this image
In [219]:
#seq(100,6000,50) %in% sort(unique(dd_binned$repeat_bin))

Patient-level metrics¶

In [221]:
head(dd,2)
A tibble: 2 × 7
SamplerepeatsConditionAPAOGendergroup_age
<chr><dbl><chr><dbl><chr><chr><fct>
1254252669.7333Presymptomatic64NAM64+
125425 299.5667Presymptomatic64NAM64+
In [222]:
head(dc,2)
A tibble: 2 × 23
SampleMolecules_countMeanMedianSDModeP90MinMaxProp_gt500⋯Prop_gt2500Prop_gt3000Prop_gt3500Prop_gt4000Prop_gt4500Prop_gt5000ConditionAPAOGender
<chr><dbl><chr><chr><chr><chr><chr><chr><chr><chr>⋯<chr><chr><chr><chr><chr><chr><chr><dbl><dbl><chr>
125425642196.412500000001 1943.45833333333581229.95511349321712669.733333333332 4083.7833333333338299.5666666666657 5050.4833333333370.984375⋯0.3281250.21875 0.2031250.1093750.0625 0.015625Presymptomatic64NAM
125718641952.40755208333351697.2833333333328976.0048584062154 1278.03333333333353241.779999999998 231.833333333333375426.7999999999990.96875 ⋯0.25 0.1718750.0781250.03125 0.0156250.015625Presymptomatic41NAM
In [231]:
df <- dd %>%
group_by(Sample, AP, AO, Gender, group_age, Condition) %>%
summarise(
    n_molecules = n(),
    mean = mean(repeats),
    median = median(repeats),
    min = min(repeats),
    max = min(repeats),
    sd = sd(repeats),
    prop <- sum
)
head(df)
`summarise()` has grouped output by 'Sample', 'AP', 'AO', 'Gender', 'group_age'. You can override using the `.groups`
argument.
A grouped_df: 6 × 8
SampleAPAOGendergroup_ageConditionn_moleculesmean
<chr><dbl><chr><chr><fct><chr><int><dbl>
12542564NAM64+Presymptomatic642196.413
12571841NAM64-Presymptomatic641952.408
1258396563F64+FTD 831880.500
1287437666M64+FTD 742933.664
12895235NAF64-Presymptomatic641536.888
13074739NAF64-Presymptomatic641862.891
In [232]:
summary(df$n_molecules)
summary(dc$Molecules_count)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  48.00   64.00   71.00   71.23   80.50   92.00 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  48.00   64.00   71.00   71.23   80.50   92.00 
In [ ]:

Overall Size Density¶

In [13]:
min(df$Fragment_size)
max(df$Fragment_size)
summary(df$Fragment_size)
57
476
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   57.0   154.0   164.0   164.8   173.0   476.0 
In [14]:
dim(df)
  1. 363392
  2. 46
In [15]:
table(df$Fragment_Status)
   Error_both_read_mut_with_other_alt                                   MUT 
                                   50                                 16769 
MUT_but_other_read_mut_with_other_alt                             Other_MUT 
                                  156                                   287 
                                   WT                 WT_but_other_read_mut 
                               341095                                   127 
 WT_but_other_read_mut_with_other_alt 
                                 4908 
In [20]:
options(repr.plot.width=15, repr.plot.height=7)

tmp <- df

ggplot(tmp, aes(x=Fragment_size)) + 
geom_density(size=1.9, fill="grey95", color="grey15") + 
theme_classic() + 
#theme_bw() + 
gtheme(30) + 
xlab("Fragment length (bp)") + 
ylab("Density") + 
geom_vline(xintercept = c(147, 157, 165), linetype='dashed') + 
scale_x_continuous(limits=c(90,300), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1))
#scale_x_continuous(limits=c(90,250), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 1, angle=90))
Warning message:
“Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.”
Warning message:
“Removed 521 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image
In [29]:
densFrag <- density(tmp$Fragment_size)
dens_frag <- data.frame(x=densFrag$x, y=densFrag$y)

options(repr.plot.width=11, repr.plot.height=7)
ggplot(dens_frag, aes(x=x, y=y)) + 
geom_line(size=1.9, fill="grey95", color="grey15") + 
theme_classic() + 
#theme_bw() + 
gtheme(30) + 
xlab("") + 
ylab("") + 
geom_vline(xintercept = c(112,122,133,140), linetype='dashed') + 
#scale_x_continuous(limits=c(90,144)) + ylim(c(0,0.005))
scale_x_continuous(limits=c(90,141)) + ylim(c(0,0.005))
Warning message in geom_line(size = 1.9, fill = "grey95", color = "grey15"):
“Ignoring unknown parameters: `fill`”
Warning message:
“Removed 451 rows containing missing values or values outside the scale range (`geom_line()`).”
No description has been provided for this image
In [ ]:
densFrag <- density(df$Fragment_size)
dens_frag <- data.frame(x=densFrag$x, y=densFrag$y)

options(repr.plot.width=15, repr.plot.height=7)
ggplot(dens_frag, aes(x=x, y=y)) + geom_line(size=1.5) + 
#theme_classic() + 
theme_bw() + 
gtheme(25) + 
xlab("Fragment length") + 
ylab("Density") + 
geom_vline(xintercept = c(112,123,133,140), linetype='dashed') + 
#scale_x_continuous(limits=c(90,144)) + ylim(c(0,0.005))
scale_x_continuous(limits=c(90,142)) + ylim(c(0,0.005))
#scale_x_continuous(limits=c(90,300), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2))
#scale_x_continuous(limits=c(90,250), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 1, angle=90))
In [1063]:
class(densFrag)
'density'
In [ ]:

Size Tumor vs. CH¶

In [16]:
mycol <- c("firebrick3","dodgerblue3")
In [17]:
table(df$Final_mutation_source)
table(df$Fragment_Status)
     CH   Tumor Unknown 
  95205   75129  193058 
   Error_both_read_mut_with_other_alt                                   MUT 
                                   50                                 16769 
MUT_but_other_read_mut_with_other_alt                             Other_MUT 
                                  156                                   287 
                                   WT                 WT_but_other_read_mut 
                               341095                                   127 
 WT_but_other_read_mut_with_other_alt 
                                 4908 
In [18]:
dff <- df %>% filter(Final_mutation_source %in% c("CH", "Tumor")) %>% 
filter(Fragment_Status %in% c("WT","MUT"))
In [24]:
length(unique(dff$Sample_Id))
35
In [19]:
table(dff$Final_mutation_source, dff$Fragment_Status)
       
          MUT    WT
  CH      593 93292
  Tumor  7222 66652
In [34]:
options(repr.plot.width=18, repr.plot.height=7)

library(ggtext)
tmp <- dff
tmp$Fragment_Status_2 <- gsub("MUT","Fragments with mutation",gsub("WT","Fragments with wild-type allele",tmp$Fragment_Status))
tmp$Fragment_Status_2 <- factor(tmp$Fragment_Status_2, levels=c("Fragments with wild-type allele","Fragments with mutation"))

gdens <- ggplot(tmp, aes(x=Fragment_size, color=Final_mutation_source)) + 
geom_density(size=1.5, linetype = "solid") + 
theme_classic() + gtheme(25) + 
xlab("Fragment length (bp)") + 
ylab("Density") + 
geom_vline(xintercept = c(145, 165), linetype='dashed') + 
#scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) + 
scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) + 
topleg +  nolegtitle + 
theme(legend.text = element_text(face="bold")) + 
theme(legend.text = element_markdown()) +  # allows colored HTML text)
scale_color_manual(values=mycol, labels = c(
      CH = "<span style='color:firebrick3'>CH</span>",
      Tumor = "<span style='color:dodgerblue3'>Tumor</span>"
    )) + 
facet_grid(.~Fragment_Status_2) 
gdens
Warning message:
“Removed 105 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image
In [35]:
ggplot(dff, aes(y=Fragment_size, x=Final_mutation_source, color=Final_mutation_source)) + 
geom_boxplot(size=1.5, linetype = "solid") + 
theme_classic() + gtheme(25) + 
topleg +  nolegtitle + 
facet_grid(.~Fragment_Status) + stat_compare_means()
No description has been provided for this image
In [36]:
wilcox.test(dff[which(dff$Fragment_Status=="WT" & dff$Final_mutation_source=="CH"),]$Fragment_size, 
            dff[which(dff$Fragment_Status=="WT" & dff$Final_mutation_source=="Tumor"),]$Fragment_size, 
            )
	Wilcoxon rank sum test with continuity correction

data:  dff[which(dff$Fragment_Status == "WT" & dff$Final_mutation_source == "CH"), ]$Fragment_size and dff[which(dff$Fragment_Status == "WT" & dff$Final_mutation_source == "Tumor"), ]$Fragment_size
W = 3185418188, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
In [37]:
wilcox.test(dff[which(dff$Fragment_Status=="MUT" & dff$Final_mutation_source=="CH"),]$Fragment_size, 
            dff[which(dff$Fragment_Status=="MUT" & dff$Final_mutation_source=="Tumor"),]$Fragment_size, 
            )
	Wilcoxon rank sum test with continuity correction

data:  dff[which(dff$Fragment_Status == "MUT" & dff$Final_mutation_source == "CH"), ]$Fragment_size and dff[which(dff$Fragment_Status == "MUT" & dff$Final_mutation_source == "Tumor"), ]$Fragment_size
W = 2344984, p-value = 0.0001149
alternative hypothesis: true location shift is not equal to 0
In [38]:
options(repr.plot.width=18, repr.plot.height=7)
ggplot(dff, aes(x=Fragment_size, color=Final_mutation_source)) + 
geom_density(size=1.5, linetype = "solid") + 
theme_classic() + gtheme(25) + 
xlab("Fragment length (bp)") + 
ylab("Density") + 
geom_vline(xintercept = c(123,133,143), linetype='dashed') + 
#scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) + 
scale_x_continuous(limits=c(100,150), breaks=c(100, 110, 120, 130, 140, 150), guide = guide_axis(n.dodge = 1)) + 
topleg +  nolegtitle + 
theme(legend.text = element_text(face="bold")) + 
theme(legend.text = element_markdown()) +  # allows colored HTML text)
scale_color_manual(values=mycol, labels = c(
      CH = "<span style='color:firebrick3'>CH</span>",
      Tumor = "<span style='color:dodgerblue3'>Tumor</span>"
    )) + 
facet_grid(.~Fragment_Status)
Warning message:
“Removed 135977 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image
In [39]:
ggplot(dff, aes(x=Fragment_size, color=Fragment_Status)) + 
geom_density(size=1.5) +
theme_classic() + gtheme(25) +
xlab("Fragment length") +
ylab("Density") +
geom_vline(xintercept = c(145, 167), linetype='dashed') +
scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) +
topleg +  nolegtitle + 
scale_color_manual(values=derain[c(2,7)]) + facet_grid(.~Final_mutation_source)
Warning message:
“Removed 105 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image

Manual Density to shade area¶

In [112]:
table(tmp$Fragment_Status, tmp$Final_mutation_source)
     
         CH Tumor
  MUT   593  7222
  WT  93292 66652
In [111]:
options(repr.plot.width=18, repr.plot.height=7)

library(ggtext)
tmp <- dff
tmp$Fragment_Status_2 <- gsub("MUT","Fragments with mutation",gsub("WT","Fragments with wild-type allele",tmp$Fragment_Status))
tmp$Fragment_Status_2 <- factor(tmp$Fragment_Status_2, levels=c("Fragments with wild-type allele","Fragments with mutation"))

gdens <- ggplot(tmp, aes(x=Fragment_size, color=Final_mutation_source)) + 
geom_density(size=1.5, linetype = "solid") + 
theme_classic() + gtheme(25) + 
xlab("Fragment length (bp)") + 
ylab("Density") + 
geom_vline(xintercept = c(145, 165), linetype='dashed') + 
#scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) + 
scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) + 
topleg +  nolegtitle + 
theme(legend.text = element_text(face="bold")) + 
theme(legend.text = element_markdown()) +  # allows colored HTML text)
scale_color_manual(values=mycol, labels = c(
      CH = "<span style='color:firebrick3'>CH</span>",
      Tumor = "<span style='color:dodgerblue3'>Tumor</span>"
    )) + 
facet_grid(.~Fragment_Status_2) 
gdens
Warning message:
“Removed 105 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image
In [41]:
hmpMUT <- dff %>% filter(dff$Fragment_Status=="MUT")

densCH_mut <- density(hmpMUT$Fragment_size[hmpMUT$Final_mutation_source == "CH"])
densTumor_mut <- density(hmpMUT$Fragment_size[hmpMUT$Final_mutation_source == "Tumor"])
dens_df_mut <- data.frame(
  x = densTumor_mut$x,
  yTumor = densTumor_mut$y,
  yCH = approx(densCH_mut$x, densCH_mut$y, xout = densTumor_mut$x)$y   # interpolate onto same x-grid
)
trapz <- function(x, y) sum( (y[-1] + y[-length(y)]) * diff(x) / 2 )
tmp <- dens_df_mut[!is.na(dens_df_mut$yTumor) & !is.na(dens_df_mut$yCH), ]
gap_area_mut <- trapz(tmp$x, abs(tmp$yTumor - tmp$yCH))
overlap_area_mut <- trapz(tmp$x, pmin(tmp$yTumor, tmp$yCH))
gap_area_mut
overlap_area_mut
# (Sanity check identity)
# overlap_area ≈ 1 - 0.5 * gap_area
1 - 0.5 * gap_area_mut
0.399990168627478
0.798504413200223
0.800004915686261
In [42]:
hmpWT <- dff %>% filter(dff$Fragment_Status=="WT")

densCH_wt <- density(hmpWT$Fragment_size[hmpWT$Final_mutation_source == "CH"])
densTumor_wt <- density(hmpWT$Fragment_size[hmpWT$Final_mutation_source == "Tumor"])
dens_df_wt <- data.frame(
  x = densTumor_wt$x,
  yTumor = densTumor_wt$y,
  yCH = approx(densCH_wt$x, densCH_wt$y, xout = densTumor_wt$x)$y   # interpolate onto same x-grid
)
trapz <- function(x, y) sum( (y[-1] + y[-length(y)]) * diff(x) / 2 )
tmp <- dens_df_wt[!is.na(dens_df_mut$yTumor) & !is.na(dens_df_mut$yCH), ]
gap_area_wt <- trapz(tmp$x, abs(tmp$yTumor - tmp$yCH))
overlap_area_wt <- trapz(tmp$x, pmin(tmp$yTumor, tmp$yCH))
gap_area_wt
overlap_area_wt
# (Sanity check identity)
# overlap_area ≈ 1 - 0.5 * gap_area
1 - 0.5 * gap_area_wt
0.0524539369747834
0.973381730512882
0.973773031512608
In [43]:
g1 <- ggplot(dens_df_wt, aes(x = x)) +
  geom_line(aes(y = yCH), color = "blue", size=1.5) +
  geom_line(aes(y = yTumor), color = "red", size=1.5) +
  geom_ribbon(aes(ymin = pmin(yCH, yTumor), ymax = pmax(yCH, yTumor)),
              fill = "grey75", alpha = 0.3) + 
  theme_classic() + gtheme(25) + 
ggtitle("xxx") + theme(plot.title = element_text(hjust = 0.5))

g2 <- ggplot(dens_df_mut, aes(x = x)) +
  geom_line(aes(y = yCH), color = "blue", size=1.5) +
  geom_line(aes(y = yTumor), color = "red", size=1.5) +
  geom_ribbon(aes(ymin = pmin(yCH, yTumor), ymax = pmax(yCH, yTumor)),
              fill = "grey75", alpha = 0.3) + 
  theme_classic() + gtheme(25) + 
ggtitle("xxx") + theme(plot.title = element_text(hjust = 0.5))

g1 + g2
Warning message:
“Removed 13 rows containing missing values or values outside the scale range (`geom_line()`).”
Warning message:
“Removed 103 rows containing missing values or values outside the scale range (`geom_line()`).”
No description has been provided for this image
In [48]:
dens_df_wt$status <- "Fragments with wild-type allele"
dens_df_mut$status <- "Fragments with mutation"
dens_df <- rbind(dens_df_wt, dens_df_mut)
dens_df$status <- factor(dens_df$status, levels=c("Fragments with wild-type allele","Fragments with mutation"))

gdens2 <- ggplot(dens_df, aes(x = x)) +
  geom_line(aes(y = yCH), color = mycol[1], size=1.8) +
  geom_line(aes(y = yTumor), color = mycol[2], size=1.8) +
  geom_ribbon(aes(ymin = pmin(yCH, yTumor), ymax = pmax(yCH, yTumor)),
              fill = "grey75", alpha = 0.3) + 
  theme_classic() + gtheme(30) +
  xlab("Fragment length (bp)") +
  ylab("Density") + facet_grid(.~status) + 
  scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1))

options(repr.plot.width=25, repr.plot.height=7)
gdens2
ggsave(gdens2, file="plots/density_ch_tumor.pdf", width=25, height=7)
ggsave(gdens2, file="plots/density_ch_tumor.png", width=25, height=7)
Warning message:
“Removed 96 rows containing missing values or values outside the scale range (`geom_line()`).”
Warning message:
“Removed 74 rows containing missing values or values outside the scale range (`geom_line()`).”
Warning message:
“Removed 96 rows containing missing values or values outside the scale range (`geom_line()`).”
Warning message:
“Removed 74 rows containing missing values or values outside the scale range (`geom_line()`).”
Warning message:
“Removed 96 rows containing missing values or values outside the scale range (`geom_line()`).”
Warning message:
“Removed 74 rows containing missing values or values outside the scale range (`geom_line()`).”
No description has been provided for this image
In [ ]:

Size regions enrichement¶

In [49]:
table(df$Fragment_Status, df$Final_mutation_source)
                                       
                                            CH  Tumor Unknown
  Error_both_read_mut_with_other_alt         7      7      36
  MUT                                      593   7222    8954
  MUT_but_other_read_mut_with_other_alt     10     53      93
  Other_MUT                                 86     60     141
  WT                                     93292  66652  181151
  WT_but_other_read_mut                     26     43      58
  WT_but_other_read_mut_with_other_alt    1191   1092    2625
In [50]:
dmut <- df %>% filter(Fragment_Status=="MUT" & Final_mutation_source%in%c("Tumor","CH"))

For each fragement length, we calculate:

  • the proportion of tumor fragments (among all tumor mutant fragements)
  • the proportion of CH fragments (among all CH mutant fragements)
  • the difference of proportions (difference in normalized counts)
In [51]:
summary(dmut$Fragment_size)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   57.0   150.0   162.0   167.5   175.0   320.0 
In [52]:
# Calculate Counts and Props 
dregions <- dmut %>% group_by(Final_mutation_source, Fragment_size, .drop = FALSE) %>% summarise(n=n(), .groups = "drop_last") %>%
mutate(prop=n / sum(n)) %>%
pivot_wider(names_from=Final_mutation_source, values_from=c(n,prop)) %>%
replace(is.na(.), 0)
dregions$diff <- dregions$prop_Tumor -  dregions$prop_CH
In [53]:
head(dregions,3)
A tibble: 3 × 6
Fragment_sizen_CHn_Tumorprop_CHprop_Tumordiff
<dbl><int><int><dbl><dbl><dbl>
85110.0016863410.0001384658-0.0015478748
98160.0016863410.0008307948-0.0008555458
100150.0016863410.0006923290-0.0009940116
In [54]:
summary(dregions$Fragment_size)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   57.0   122.8   180.5   180.6   238.2   320.0 
In [ ]:

In [55]:
# Add the missing Fragemnt size between min and max with 0 counts / prop
xall <- min(dregions$Fragment_size):max(dregions$Fragment_size)
xmissing <- xall[!xall %in% dregions$Fragment_size]
dadd <- data.frame(
    Fragment_size = xmissing,
    n_CH = 0,
    n_Tumor = 0,
    prop_CH = 0,
    prop_Tumor = 0,
    diff = 0
)
dregions2 <- rbind(dregions, dadd)
In [56]:
myrange <- 70:310
tmp <- dregions2[which(dregions2$Fragment_size>=myrange[1] & dregions2$Fragment_size<=myrange[length(myrange)]) , ]
lfit <- loess(tmp$diff ~ tmp$Fragment_size, span=0.3)
lpred <- predict(lfit, myrange, se=T)
borninf <- lpred$fit - qt(0.975, lpred$df)*lpred$se.fit
bornsup <- lpred$fit + qt(0.975, lpred$df)*lpred$se.fit

cat("95% CI inf is above 0:")
myrange[borninf>0]
cat("95% CI supp is below 0:")
myrange[bornsup<0]
95% CI inf is above 0:
  1. 132
  2. 133
  3. 134
  4. 135
  5. 136
  6. 137
  7. 138
  8. 139
  9. 140
  10. 141
  11. 142
  12. 143
  13. 144
  14. 145
  15. 146
  16. 147
  17. 148
  18. 149
  19. 150
  20. 151
  21. 152
  22. 260
  23. 261
  24. 262
  25. 263
  26. 264
  27. 265
  28. 266
  29. 267
  30. 268
  31. 269
  32. 270
  33. 271
  34. 272
  35. 273
  36. 274
95% CI supp is below 0:
  1. 159
  2. 160
  3. 161
  4. 162
  5. 163
  6. 164
  7. 165
  8. 166
  9. 167
  10. 168
  11. 169
  12. 170
  13. 171
  14. 172
  15. 173
  16. 174
  17. 175
  18. 176
  19. 177
  20. 178
  21. 179
  22. 180
  23. 181
  24. 182
  25. 183
  26. 184
  27. 185
  28. 186
  29. 187
  30. 188
  31. 189
  32. 190
  33. 191
  34. 192
  35. 193
  36. 194
  37. 195
  38. 196
In [58]:
options(repr.plot.width=12,repr.plot.height=10)

gsize <- ggplot(dregions2, aes(x = Fragment_size, y=diff)) +
  annotate("rect", xmin=132, xmax=152, ymin=-0.038, ymax=0.016, fill ='dodgerblue3', alpha=0.2) + 
  annotate("rect", xmin = 159, xmax = 196, ymin = -0.038, ymax = 0.016, fill ='firebrick3', alpha = 0.2) + 
  geom_point(size=1.5) +
  geom_smooth(method="loess", span=0.3, color="black", level=0.95, fill="grey55") + 
  geom_hline(yintercept=0, linetype="dashed", size=0.5) + 
  theme_classic() + gtheme(40) + 
  #scale_x_continuous(limits=c(100,170), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) + 
  scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) + 
  xlab("Fragment length (bp)") +
  ylab("Difference in normalized count")
gsize
ggsave(gsize, file="plots/sizesmooth_ch_tumor.pdf", width=12, height=10)
ggsave(gsize, file="plots/sizesmooth_ch_tumor.png", width=12, height=10)
`geom_smooth()` using formula = 'y ~ x'
Warning message:
“Removed 23 rows containing non-finite outside the scale range (`stat_smooth()`).”
Warning message:
“Removed 23 rows containing missing values or values outside the scale range (`geom_point()`).”
`geom_smooth()` using formula = 'y ~ x'
Warning message:
“Removed 23 rows containing non-finite outside the scale range (`stat_smooth()`).”
Warning message:
“Removed 23 rows containing missing values or values outside the scale range (`geom_point()`).”
`geom_smooth()` using formula = 'y ~ x'
Warning message:
“Removed 23 rows containing non-finite outside the scale range (`stat_smooth()`).”
Warning message:
“Removed 23 rows containing missing values or values outside the scale range (`geom_point()`).”
No description has been provided for this image

try with a rolling window (later)¶

In [322]:
library(runner)
In [341]:
if (1==0) {
w <- 10

df_roll <- dmut %>%
  arrange(Fragment_size) %>%
  mutate(one = 1L) %>%
  group_by(Final_mutation_source, Fragment_size) %>%  # <-- your two factor variables here
  mutate(n_window = runner(
    x = one,
    idx = Fragment_size,        # numeric index
    k = w,             # window width in *size* units
    f = function(z) sum(z),
    na_pad = TRUE
  )) #%>%
  #ungroup()
}
In [417]:
windowsize <- 3
allrange <- 70:320

dmut_Tumor <- dmut %>% filter(Final_mutation_source=="Tumor")
dmut_CH <- dmut %>% filter(Final_mutation_source=="CH")

lres <- lapply(allrange, FUN=function(i) {
    windowmin <- (i-windowsize)
    windowmax <- (i+windowsize)
    n_CH <- sum(dmut_CH$Fragment_size >= windowmin & dmut_CH$Fragment_size <= windowmax)
    n_Tumor <- sum(dmut_Tumor$Fragment_size >= windowmin & dmut_Tumor$Fragment_size <= windowmax)
    dres <- data.frame(Fragment_size = i,
               n_CH = n_CH,
               n_Tumor = n_Tumor,
               prop_CH = n_CH/nrow(dmut_CH),
               prop_Tumor = n_Tumor/nrow(dmut_Tumor)
              )
    return(dres)
})
dregions3 <- do.call("rbind", lres)
dregions3$diff <- dregions3$prop_Tumor -  dregions3$prop_CH
In [418]:
options(repr.plot.width=12,repr.plot.height=10)
#options(repr.plot.width=20,repr.plot.height=10)

ggplot(dregions3, aes(x = Fragment_size, y=diff)) +
  #annotate("rect", xmin=144, xmax=158, ymin=-0.038, ymax=0.016, fill ='dodgerblue3', alpha=0.4) + 
  #annotate("rect", xmin = 165, xmax = 181, ymin = -0.038, ymax = 0.016, fill ='firebrick3', alpha = 0.4) + 
  #annotate("rect", xmin=132, xmax=152, ymin=-0.038, ymax=0.016, fill ='dodgerblue3', alpha=0.1) + 
  #annotate("rect", xmin = 159, xmax = 196, ymin = -0.038, ymax = 0.016, fill ='firebrick3', alpha = 0.1) + 
  geom_point() +
  geom_smooth(method="loess", span=0.6, color="black") + 
  #geom_smooth(color="black") + 
  geom_hline(yintercept=0, linetype="dashed", size=0.5) + 
  theme_classic() + gtheme(30) + 
  #scale_x_continuous(limits=c(100,170), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) + 
  scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) + 
  xlab("Fragment length (bp)") +
  ylab("Difference in normalized count")
`geom_smooth()` using formula = 'y ~ x'
Warning message:
“Removed 10 rows containing non-finite outside the scale range (`stat_smooth()`).”
Warning message:
“Removed 10 rows containing missing values or values outside the scale range (`geom_point()`).”
No description has been provided for this image

Patient Level Tumor vs. CH¶

Make features at the patient level

  • Proportion of fragments below 150 etc
  • Other features?
In [59]:
dim(dmut)
  1. 7815
  2. 46
In [60]:
dprop <- dmut %>% group_by(Sample_Id, Final_mutation_source) %>%
summarize(n_frag=n(), n_150=sum(Fragment_size<=150),
          n_150_200=sum(Fragment_size>150 & Fragment_size<200),
          n_200=sum(Fragment_size>=200),
          n_230=sum(Fragment_size>=230)
         )
dprop$prop_150 <- dprop$n_150 / dprop$n_frag
dprop$prop_150_200 <- dprop$n_150_200 / dprop$n_frag
dprop$prop_200 <- dprop$n_200 / dprop$n_frag
dprop$prop_230 <- dprop$n_200 / dprop$n_frag
`summarise()` has grouped output by 'Sample_Id'. You can override using the `.groups` argument.
In [61]:
dlprop <- dprop[,-c(3:6)] %>%
pivot_longer(cols=c(prop_150, prop_150_200, prop_200), names_to = "size", values_to = "prop")

head(dlprop)
A grouped_df: 6 × 6
Sample_IdFinal_mutation_sourcen_230prop_230sizeprop
<chr><chr><int><dbl><chr><dbl>
ORD-1014010-01Tumor200.19811321prop_150 0.23584906
ORD-1014010-01Tumor200.19811321prop_150_2000.56603774
ORD-1014010-01Tumor200.19811321prop_200 0.19811321
ORD-1016181-01CH 20.04347826prop_150 0.19875776
ORD-1016181-01CH 20.04347826prop_150_2000.75776398
ORD-1016181-01CH 20.04347826prop_200 0.04347826
In [62]:
options(repr.plot.width=16.5,repr.plot.height=10)

dlprop$facet <- "length below 150bp"
dlprop$facet[dlprop$size=="prop_150_200"] <- "length 150-200bp"
dlprop$facet[dlprop$size=="prop_200"] <- "length over 200bp"
dlprop$facet <- factor(dlprop$facet, levels=c("length below 150bp","length 150-200bp","length over 200bp"))

ggplot(dlprop,aes(x=Final_mutation_source, y=prop, colour=Final_mutation_source)) + 
geom_boxplot(outlier.shape = NA, width=.3, size=1.5) + 
geom_jitter(size=4, width = .1, alpha=.6) + 
theme_classic() + gtheme(35) + 
facet_wrap(.~facet, scales="free") + 
stat_compare_means(size=7, label.y.npc = 1, label.x.npc = .25, aes(label = sprintf("p = %s", after_stat(p.format)))) + 
theme(axis.text.x = element_text(colour = mycol, face="bold")) + 
noleg + 
scale_color_manual(values=mycol) +
ylab("Proportion of mutant fragments per patient") + 
xlab(expression(paste(italic(TP53)," Y220C Mutation Origin"))) + 
theme(strip.background =element_rect(fill="grey90"))
Warning message:
“Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.”
No description has been provided for this image
In [63]:
options(repr.plot.width=16.5,repr.plot.height=11)

dlprop$facet <- "length below 150bp"
dlprop$facet[dlprop$size=="prop_150_200"] <- "length 150-200bp"
dlprop$facet[dlprop$size=="prop_200"] <- "length over 200bp"
dlprop$facet <- factor(dlprop$facet, levels=c("length below 150bp","length 150-200bp","length over 200bp"))

gprop <- ggplot(dlprop,aes(x=Final_mutation_source, y=prop, colour=Final_mutation_source)) + 
geom_boxplot(outlier.shape = NA, width=.3, size=1.5) + 
geom_jitter(size=4, width = .1, alpha=.6) + 
theme_classic() + gtheme(35) + 
facet_wrap(.~facet, scales="free") + 
stat_compare_means(size=7, label.y.npc = 1, label.x.npc = .25, aes(label = sprintf("p = %s", after_stat(p.format)))) + 
scale_x_discrete(labels=c("CH\nn=19","Tumor\nn=16")) + 
theme(axis.text.x = element_text(colour = mycol, face="bold")) + 
noleg + 
scale_color_manual(values=mycol) +
ylab("Proportion of mutant fragments per patient") + 
xlab(expression(paste(italic(TP53)," Y220C Mutation Origin"))) + 
theme(strip.background =element_rect(fill="grey90"))
gprop

ggsave(gprop, file="plots/propsize_ch_tumor.pdf", width=16.5, height=11)
ggsave(gprop, file="plots/propsize_ch_tumor.png", width=16.5, height=11)
Warning message:
“Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.”
No description has been provided for this image

Distance Max Mut Peak and 167 / Max Mut WT Peaks¶

In [64]:
dd.select <- dd %>% filter(Final_mutation_source %in% c("CH","Tumor"))
In [65]:
table(dff$Fragment_Status, dff$Final_mutation_source)
     
         CH Tumor
  MUT   593  7222
  WT  93292 66652
In [66]:
trapz <- function(x, y) sum( (y[-1] + y[-length(y)]) * diff(x) / 2 )
lrespeak <- lapply(dd.select$samples_ID, FUN=function(ord) {
    ww <- dff %>% filter(Sample_Id == ord)
    nmut <- sum(ww$Fragment_Status=="MUT")
    nwt <- sum(ww$Fragment_Status=="WT")

    densmut <- density(ww[ww$Fragment_Status=="MUT",]$Fragment_size)
    maxmut <- densmut$x[which.max(densmut$y)]
    denswt <- density(ww[ww$Fragment_Status=="WT",]$Fragment_size)
    maxwt <- denswt$x[which.max(denswt$y)]

    dens_df <- data.frame(
        x = denswt$x,
        y = denswt$y,
        yy = approx(densmut$x, densmut$y, xout = denswt$x)$y   # interpolate onto same x-grid
    )
    tmp <- dens_df[!is.na(dens_df$y) & !is.na(dens_df$yy), ]
    gap_area <- trapz(tmp$x, abs(tmp$y - tmp$yy))
    overlap_area <- trapz(tmp$x, pmin(tmp$y, tmp$yy))
    dres <- data.frame(sample=ord, n_mut=nmut, n_wt=nwt, peak_mut=maxmut, peak_wt=maxwt, gap=gap_area, overlap=overlap_area)
    return(dres)
})
drespeak <- do.call("rbind",lrespeak)
In [67]:
head(drespeak,3)
A data.frame: 3 × 7
samplen_mutn_wtpeak_mutpeak_wtgapoverlap
<chr><int><int><dbl><dbl><dbl><dbl>
1ORD-1014010-011061632158.6660166.39620.44540240.7765453
2ORD-1016181-011613988166.9185166.25910.15160530.9237243
3ORD-1110419-01 583575154.7055165.30760.34615320.8267282
In [68]:
tt <- left_join(drespeak, dd[,c("samples_ID","Final_mutation_source")], by=c("sample"="samples_ID"))
tt <- tt %>% filter(n_mut>=10)
table(tt$Final_mutation_source)

gpeakmut <- ggplot(tt,aes(x=Final_mutation_source,
                       y=peak_mut,
                       #y=(peak_wt - peak_mut), 
                       colour=Final_mutation_source)) + 
geom_violin(alpha=.3, aes(fill=Final_mutation_source)) + 
geom_boxplot(outlier.shape = NA, width=.1, size=1, fill="white") +
#geom_boxplot(outlier.shape = NA, width=.3, size=1.5) + 
geom_jitter(size=4, width = .1, alpha=.6) + 
theme_classic() + gtheme(35) + 
stat_compare_means(size=7, label.y.npc = 1, label.x.npc = .3) +
theme(axis.text.x = element_text(colour = mycol, face="bold")) + 
noleg + 
scale_color_manual(values=mycol) + scale_fill_manual(values=mycol) + 
scale_x_discrete(labels=c("CH\nn=17","Tumor\nn=15")) + 
ylab("Size Distribution Peak (bp)") + 
xlab(expression(paste(italic(TP53)," Y220C Mutation Origin"))) +
facet_grid(.~"Mutant Fragments") + theme(strip.background =element_rect(fill="grey90"))

gpeakwt<- ggplot(tt,aes(x=Final_mutation_source,
                       y=peak_wt,
                       #y=(peak_wt - peak_mut), 
                       colour=Final_mutation_source)) + 
geom_violin(alpha=.3, aes(fill=Final_mutation_source)) + 
geom_boxplot(outlier.shape = NA, width=.1, size=1, fill="white") +
#geom_boxplot(outlier.shape = NA, width=.3, size=1.5) + 
geom_jitter(size=4, width = .1, alpha=.6) + 
theme_classic() + gtheme(35) + 
stat_compare_means(size=7, label.y.npc = 1, label.x.npc = .3) +
theme(axis.text.x = element_text(colour = mycol, face="bold")) + 
noleg + 
scale_color_manual(values=mycol) + scale_fill_manual(values=mycol) + 
scale_x_discrete(labels=c("CH\nn=17","Tumor\nn=15")) + 
ylab("Wild-Type Fragment\nSize Distribution Peak (bp)") + 
xlab(expression(paste(italic(TP53)," Y220C Mutation Origin"))) +
facet_grid(.~"Wild-type Fragments") + theme(strip.background =element_rect(fill="grey90"))

options(repr.plot.width=16,repr.plot.height=10)
#gpeakmut + gpeakwt
   CH Tumor 
   17    15 
Warning message:
“Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.”
Warning message:
“Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.”
In [69]:
summary(tt$n_mut)
summary(tt$n_wt)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  11.00   17.75   41.00  243.50   85.25 3387.00 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1632    3376    4320    4442    5758    7744 
In [70]:
dtmp <- tt %>% pivot_longer(cols = c("peak_wt","peak_mut"), names_to="status", values_to="peak")
dtmp$status2 <- "Mutant Fragments"
dtmp$status2[dtmp$status=="peak_wt"] <- "Wild-type Fragments"

gpeak <- ggplot(dtmp,aes(x=Final_mutation_source,
                       y=peak,
                       colour=Final_mutation_source)) + 
geom_violin(alpha=.3, aes(fill=Final_mutation_source)) + 
geom_boxplot(outlier.shape = NA, width=.1, size=1, fill="white") +
#geom_boxplot(outlier.shape = NA, width=.3, size=1.5) + 
geom_jitter(size=4, width = .1, alpha=.6) + 
theme_classic() + gtheme(35) + 
stat_compare_means(size=7, label.y.npc = 1, label.x.npc = .3) +
theme(axis.text.x = element_text(colour = mycol, face="bold")) + 
noleg + 
scale_color_manual(values=mycol) + scale_fill_manual(values=mycol) + 
scale_x_discrete(labels=c("CH\nn=17","Tumor\nn=15")) + 
ylab("Size Distribution Peak (bp)") + 
xlab(expression(paste(italic(TP53)," Y220C Mutation Origin"))) +
facet_wrap(.~status2, scales="free_y") + theme(strip.background =element_rect(fill="grey90"))

options(repr.plot.width=13,repr.plot.height=11)
gpeak
ggsave(gpeak, file="plots/peak_ch_tumor.pdf", width=13, height=11)
ggsave(gpeak, file="plots/peak_ch_tumor.png", width=13, height=11)
Warning message:
“Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.”
No description has been provided for this image
In [72]:
dtmp %>% group_by(Final_mutation_source, status2) %>% summarize(median(peak))
`summarise()` has grouped output by 'Final_mutation_source'. You can override using the `.groups` argument.
A grouped_df: 4 × 3
Final_mutation_sourcestatus2median(peak)
<chr><chr><dbl>
CH Mutant Fragments 164.4001
CH Wild-type Fragments165.8582
TumorMutant Fragments 158.6660
TumorWild-type Fragments165.5992

Plot Density With Peak Estimates for each Patient¶

In [73]:
colmutwt <- derain[c(3,5)]
In [74]:
head(drespeak,3)
A data.frame: 3 × 7
samplen_mutn_wtpeak_mutpeak_wtgapoverlap
<chr><int><int><dbl><dbl><dbl><dbl>
1ORD-1014010-011061632158.6660166.39620.44540240.7765453
2ORD-1016181-011613988166.9185166.25910.15160530.9237243
3ORD-1110419-01 583575154.7055165.30760.34615320.8267282
In [75]:
table(tt$Final_mutation_source)
   CH Tumor 
   17    15 

CH¶

In [76]:
#ord <- "ORD-1014010-01"
ttch <- tt %>% filter(Final_mutation_source=="CH")
ww <- dff %>% filter(Sample_Id%in%(ttch$sample))
dfacet <- ttch
dfacet$Sample_Id <- dfacet$sample

gallch <- ggplot(ww) + 
geom_density(size=2, aes(x=Fragment_size, color=Fragment_Status)) +
theme_classic() + gtheme(30) +
xlab("Fragment length (bp)") +
ylab("Density") +
geom_vline(data=dfacet,aes(xintercept=peak_mut),col=colmutwt[1], inherit.aes = FALSE, linetype="dashed", size=1) +
geom_vline(data=dfacet,aes(xintercept=peak_wt),col=colmutwt[2], inherit.aes = FALSE, linetype="dashed", size=1) +
#scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) +
scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) +
geom_text(data=dfacet, aes(x=250, y=.01, label=paste("N mut =",n_mut)), size=8, color=colmutwt[1], fontface="bold") +
geom_text(data=dfacet, aes(x=250, y=.014, label=paste("N wt =",n_wt)), size=8, color=colmutwt[2], fontface="bold") +
facet_wrap(.~Sample_Id, scales="free_y", ncol=4) + 
topleg + 
scale_color_manual(name="Mutation status of fragments at the TP53 Y220 locus", values=colmutwt, labels=c("Mutant","Wild-type")) + 
theme(legend.box="horizontal")

options(repr.plot.width=29,repr.plot.height=25)
gallch
Warning message in geom_vline(data = dfacet, aes(xintercept = peak_mut), col = colmutwt[1], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message in geom_vline(data = dfacet, aes(xintercept = peak_wt), col = colmutwt[2], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message:
“Removed 55 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image

Tumor¶

In [77]:
tttumor <- tt %>% filter(Final_mutation_source=="Tumor")
ww <- dff %>% filter(Sample_Id%in%(tttumor$sample))
dfacet <- tttumor
dfacet$Sample_Id <- dfacet$sample

galltumor <- ggplot(ww) + 
geom_density(size=2, aes(x=Fragment_size, color=Fragment_Status)) +
theme_classic() + gtheme(30) +
xlab("Fragment length (bp)") +
ylab("Density") +
geom_vline(data=dfacet,aes(xintercept=peak_mut),col=colmutwt[1], inherit.aes = FALSE, linetype="dashed", size=1) +
geom_vline(data=dfacet,aes(xintercept=peak_wt),col=colmutwt[2], inherit.aes = FALSE, linetype="dashed", size=1) +
#scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) +
scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) +
geom_text(data=dfacet, aes(x=250, y=.01, label=paste("N mut =",n_mut)), size=8, color=colmutwt[1], fontface="bold") +
geom_text(data=dfacet, aes(x=250, y=.014, label=paste("N wt =",n_wt)), size=8, color=colmutwt[2], fontface="bold") +
facet_wrap(.~Sample_Id, scales="free_y") + 
topleg + 
scale_color_manual(name="Mutation status of fragments at the TP53 Y220 locus", values=colmutwt, labels=c("Mutant","Wild-type")) + 
theme(legend.box="horizontal")

options(repr.plot.width=28,repr.plot.height=19)
galltumor
Warning message in geom_vline(data = dfacet, aes(xintercept = peak_mut), col = colmutwt[1], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message in geom_vline(data = dfacet, aes(xintercept = peak_wt), col = colmutwt[2], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message:
“Removed 47 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image

ORD-1210835-01 / ORD-1016181-01¶

In [80]:
#ww$Sample_Id
#tt$sample
In [96]:
unique(ww$Sample_Id)
  1. 'ORD-1016181-01'
  2. 'ORD-1210835-01'
In [110]:
ttord <- tt %>% filter(sample%in%c("ORD-1210835-01","ORD-1016181-01"))
ww <- dff %>% filter(Sample_Id%in%c("ORD-1210835-01","ORD-1016181-01"))
dfacet <- ttord
dfacet$Sample_Id <- dfacet$sample
ww$sample2 <- NA
ww$sample2[which(ww$Sample_Id=="ORD-1210835-01")] <- "Representative patient with tumor-derived mut."
ww$sample2[which(ww$Sample_Id=="ORD-1016181-01")] <- "Representative patient with CH-derived mut."
dfacet <- left_join(dfacet, ww[,c("Sample_Id","sample2")], by="Sample_Id")

ggo <- ggplot(ww) + 
geom_density(size=2.5, aes(x=Fragment_size, color=Fragment_Status)) +
theme_classic() + gtheme(35) +
xlab("Fragment length (bp)") +
ylab("Density") +
geom_vline(data=dfacet,aes(xintercept=peak_mut),col=colmutwt[1], inherit.aes = FALSE, linetype="dashed", size=1.5) +
geom_vline(data=dfacet,aes(xintercept=peak_wt),col=colmutwt[2], inherit.aes = FALSE, linetype="dashed", size=1.5) +
#scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) +
scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) +
geom_text(data=dfacet, aes(x=250, y=.005, label=paste("N mutant fragments =",n_mut)), size=8, color=colmutwt[1]) +
geom_text(data=dfacet, aes(x=250, y=.008, label=paste("N wild-type fragments =",n_wt)), size=8, color=colmutwt[2]) +
facet_wrap(.~sample2, scales="free_y", ncol=1) + 
theme(legend.position='left') + 
scale_color_manual(name="Status\nof fragments\nat TP53 Y220", 
                   values=colmutwt, labels=c("Mutant","Wild-type")) + 
theme(legend.box="horizontal")

options(repr.plot.width=15,repr.plot.height=12)
ggo
Warning message in geom_vline(data = dfacet, aes(xintercept = peak_mut), col = colmutwt[1], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message in geom_vline(data = dfacet, aes(xintercept = peak_wt), col = colmutwt[2], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message:
“Removed 3 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image

MVA?¶

In [968]:
library(forestmodel)
In [969]:
head(dprop,2)
head(drespeak,2)
A grouped_df: 2 × 11
Sample_IdFinal_mutation_sourcen_fragn_150n_150_200n_200n_230prop_150prop_150_200prop_200prop_230
<chr><chr><int><int><int><int><int><dbl><dbl><dbl><dbl>
ORD-1014010-01Tumor10625 6021200.23584910.56603770.198113210.19811321
ORD-1016181-01CH 16132122 7 20.19875780.75776400.043478260.04347826
A data.frame: 2 × 7
samplen_mutn_wtpeak_mutpeak_wtgapoverlap
<chr><int><int><dbl><dbl><dbl><dbl>
1ORD-1014010-011061632158.6660166.39620.44540240.7765453
2ORD-1016181-011613988166.9185166.25910.15160530.9237243
In [970]:
summary(dd.select$nb_frag_MUT_fragmentomics)
summary(dd.select$nb_frag_WT_fragmentomics)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    6.0    15.0    37.0   223.3    82.0  3387.0 
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1639    3531    4725    4582    5775    7759 
In [997]:
options(warn = -1)

dmva <- left_join(dd.select, dprop[,c("Sample_Id","prop_150","prop_150_200","prop_200","prop_230")], 
                  by=c("samples_ID"="Sample_Id"))
dmva <- left_join(dmva, drespeak,by=c("samples_ID"="sample"))
                  
dmva$VAF <- 100*as.numeric(dmva$VAF_FMI_TP53_Y220C) / 5

dmva$response <- 1
dmva$response[dmva$Final_mutation_source=="CH"] <- 0

dmva <- dmva %>% filter(n_mut >= 10)

dmva$prop <- 100*dmva$prop_150
dmva$prop230 <- 100*dmva$prop_230
dmva$prop200 <- 100*dmva$prop_200
dmva$dist <- dmva$peak_wt - dmva$peak_mut
dmva$peak <- dmva$peak_mut

#rest <- glm(formula = response ~ VAF +  peak_mut + prop230, family="binomial", data=dmva)
rest <- glm(formula = response ~ VAF +  peak_mut + prop200, family="binomial", data=dmva)
#rest <- glm(formula = response ~ VAF + peak_mut, family="binomial", data=dmva)
#rest <- glm(formula = response ~ VAF + peak + overlap, family="binomial", data=dmva)
#rest <- glm(formula = response ~ VAF_FMI_TP53_Y220C + prop_150 + prop_150_200 + prop_200, family="binomial", data=dmva)
#rest
options(repr.plot.width=10,repr.plot.height=6)
forest_model(rest,
             format_options = forest_model_format_options(text_size = 6, shape=18)
            )
No description has been provided for this image
In [1016]:
options(warn = -1)

dmva <- left_join(dd.select, dprop[,c("Sample_Id","prop_150","prop_150_200","prop_200","prop_230")], 
                  by=c("samples_ID"="Sample_Id"))
dmva <- left_join(dmva, drespeak,by=c("samples_ID"="sample"))
                  
dmva$VAF <- 100*as.numeric(dmva$VAF_FMI_TP53_Y220C) / 5
dmva[,'VAF by 5% increase'] <- dmva$VAF

dmva$response <- 1
dmva$response[dmva$Final_mutation_source=="CH"] <- 0

dmva <- dmva %>% filter(n_mut >= 10)

dmva$prop <- 100*dmva$prop_150
dmva$prop230 <- 100*dmva$prop_230
dmva$prop200 <- 100*dmva$prop_200
dmva[,'Proportion of Mutant Fragments > 200bp'] <- 100*dmva$prop_200
dmva$dist <- dmva$peak_wt - dmva$peak_mut
dmva$peak <- dmva$peak_mut
dmva[,'Mutant Fragments Peak Size'] <- dmva$peak_mut

rest <- glm(formula = response ~ `VAF by 5% increase` +  `Mutant Fragments Peak Size` + `Proportion of Mutant Fragments > 200bp`, family="binomial", data=dmva)
#rest
options(repr.plot.width=20,repr.plot.height=6)
forest_model(rest,
             format_options = forest_model_format_options(text_size = 8, shape=18)
            ) + gtheme(20)
No description has been provided for this image
In [1018]:
table(dmva$response)
 0  1 
17 15 

Case Report¶

Read data¶

In [120]:
# Available on NextCloud: 
# data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup
dircc <- "../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup" 
cc1.stk <- read_table(paste0(dircc,"/","ORD-1799945-01_EGFR_STK11_remove_duplicates.tsv"))
cc1.tp53 <- read_table(paste0(dircc,"/","ORD-1799945-01_TP53_remove_duplicates.tsv"))
cc2.stk <- read_table(paste0(dircc,"/","ORD-1920359-01_EGFR_STK11_remove_duplicates.tsv"))
cc2.tp53 <- read_table(paste0(dircc,"/","ORD-1920359-01_TP53_remove_duplicates.tsv"))
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  Position = col_double(),
  Alt = col_logical(),
  Fragment_size = col_double(),
  Inner_distance = col_double(),
  Read_5p = col_double(),
  MAPQ_5p = col_double(),
  MAPQ_3p = col_double(),
  BASE_5p = col_logical(),
  BASQ_5p = col_logical(),
  Pos_bam_5p = col_double(),
  Pos_bam_3p = col_double(),
  TLEN = col_double(),
  Nb_fragment_bases_softclip_5p = col_double(),
  Nb_fragment_bases_softclip_3p = col_double(),
  VAF = col_double(),
  Start_test = col_double(),
  Start_test_read_5p = col_double(),
  Start_test_read_3p = col_double(),
  Startpos_read_5p_cor = col_double(),
  Startpos_read_3p_cor = col_double()
)
ℹ Use `spec()` for the full column specifications.

Warning message:
“43626 parsing failures.
 row     col           expected               actual                                                                                                                                                                            file
1381 BASE_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_EGFR_STK11_remove_duplicates.tsv'
1381 BASQ_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_EGFR_STK11_remove_duplicates.tsv'
1382 BASE_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_EGFR_STK11_remove_duplicates.tsv'
1382 BASQ_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_EGFR_STK11_remove_duplicates.tsv'
1383 BASE_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_EGFR_STK11_remove_duplicates.tsv'
.... ....... .................. .................... ...............................................................................................................................................................................
See problems(...) for more details.
”

── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  Position = col_double(),
  Ref = col_logical(),
  Fragment_size = col_double(),
  Inner_distance = col_double(),
  Read_5p = col_double(),
  MAPQ_5p = col_double(),
  MAPQ_3p = col_double(),
  Pos_bam_5p = col_double(),
  Pos_bam_3p = col_double(),
  TLEN = col_double(),
  Nb_fragment_bases_softclip_5p = col_double(),
  Nb_fragment_bases_softclip_3p = col_double(),
  VAF = col_logical(),
  Start_test = col_double(),
  Start_test_read_5p = col_double(),
  Start_test_read_3p = col_double(),
  Startpos_read_5p_cor = col_double(),
  Startpos_read_3p_cor = col_double()
)
ℹ Use `spec()` for the full column specifications.

Warning message:
“140724 parsing failures.
 row col           expected              actual                                                                                                                                                                      file
7729 Ref 1/0/T/F/TRUE/FALSE C                   '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_TP53_remove_duplicates.tsv'
7729 VAF 1/0/T/F/TRUE/FALSE 0.00611147327249022 '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_TP53_remove_duplicates.tsv'
7730 Ref 1/0/T/F/TRUE/FALSE C                   '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_TP53_remove_duplicates.tsv'
7730 VAF 1/0/T/F/TRUE/FALSE 0.00611147327249022 '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_TP53_remove_duplicates.tsv'
7731 Ref 1/0/T/F/TRUE/FALSE C                   '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1799945-01_TP53_remove_duplicates.tsv'
.... ... .................. ................... .........................................................................................................................................................................
See problems(...) for more details.
”

── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  Position = col_double(),
  Alt = col_logical(),
  Fragment_size = col_double(),
  Inner_distance = col_double(),
  Read_5p = col_double(),
  MAPQ_5p = col_double(),
  MAPQ_3p = col_double(),
  BASE_5p = col_logical(),
  BASQ_5p = col_logical(),
  Pos_bam_5p = col_double(),
  Pos_bam_3p = col_double(),
  TLEN = col_double(),
  Nb_fragment_bases_softclip_5p = col_double(),
  Nb_fragment_bases_softclip_3p = col_double(),
  VAF = col_double(),
  Start_test = col_double(),
  Start_test_read_5p = col_double(),
  Start_test_read_3p = col_double(),
  Startpos_read_5p_cor = col_double(),
  Startpos_read_3p_cor = col_double()
)
ℹ Use `spec()` for the full column specifications.

Warning message:
“41345 parsing failures.
 row     col           expected               actual                                                                                                                                                                            file
1247 BASE_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_EGFR_STK11_remove_duplicates.tsv'
1247 BASQ_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_EGFR_STK11_remove_duplicates.tsv'
1326 BASE_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_EGFR_STK11_remove_duplicates.tsv'
1326 BASQ_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_EGFR_STK11_remove_duplicates.tsv'
1327 BASE_5p 1/0/T/F/TRUE/FALSE no_deletion_detected '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_EGFR_STK11_remove_duplicates.tsv'
.... ....... .................. .................... ...............................................................................................................................................................................
See problems(...) for more details.
”

── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  Position = col_double(),
  Ref = col_logical(),
  Fragment_size = col_double(),
  Inner_distance = col_double(),
  Read_5p = col_double(),
  MAPQ_5p = col_double(),
  MAPQ_3p = col_double(),
  Pos_bam_5p = col_double(),
  Pos_bam_3p = col_double(),
  TLEN = col_double(),
  Nb_fragment_bases_softclip_5p = col_double(),
  Nb_fragment_bases_softclip_3p = col_double(),
  VAF = col_double(),
  Start_test = col_double(),
  Start_test_read_5p = col_double(),
  Start_test_read_3p = col_double(),
  Startpos_read_5p_cor = col_double(),
  Startpos_read_3p_cor = col_double()
)
ℹ Use `spec()` for the full column specifications.

Warning message:
“57844 parsing failures.
 row col           expected actual                                                                                                                                                                      file
7518 Ref 1/0/T/F/TRUE/FALSE      C '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_TP53_remove_duplicates.tsv'
7519 Ref 1/0/T/F/TRUE/FALSE      C '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_TP53_remove_duplicates.tsv'
7520 Ref 1/0/T/F/TRUE/FALSE      C '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_TP53_remove_duplicates.tsv'
7521 Ref 1/0/T/F/TRUE/FALSE      C '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_TP53_remove_duplicates.tsv'
7522 Ref 1/0/T/F/TRUE/FALSE      C '../../../data/fRagmentomics_v0.1.0_package/results/2025-05-22_results_fRagmentomics_v0.1.0_case_report_TP53_Y220C_without_dup/ORD-1920359-01_TP53_remove_duplicates.tsv'
.... ... .................. ...... .........................................................................................................................................................................
See problems(...) for more details.
”

Plot STK11 / TP53 Y220C¶

In [192]:
stk1 <- cc1.stk %>% filter(Chromosome=="chr19") %>% filter(Fragment_Status %in% c("MUT","WT"))
stk2 <- cc2.stk %>% filter(Chromosome=="chr19") %>% filter(Fragment_Status %in% c("MUT","WT"))
yc1 <- cc1.tp53 %>% filter(Position==7578190) %>% filter(Fragment_Status %in% c("MUT","WT"))
yc2 <- cc2.tp53 %>% filter(Position==7578190) %>% filter(Fragment_Status %in% c("MUT","WT"))
In [197]:
# STK11 - 1
nmut.stk1 <- sum(stk1$Fragment_Status=="MUT")
nwt.stk1 <- sum(stk1$Fragment_Status=="WT")
densmut.stk1 <- density(stk1[stk1$Fragment_Status=="MUT",]$Fragment_size)
denswt.stk1 <- density(stk1[stk1$Fragment_Status=="WT",]$Fragment_size)
peakmut.stk1 <- densmut.stk1$x[which.max(densmut.stk1$y)]
peakwt.stk1 <- denswt.stk1$x[which.max(denswt.stk1$y)]
dres.stk1 <- data.frame(sample="Pre-traitement STK11", 
                        n_mut=nmut.stk1, n_wt=nwt.stk1, peak_mut=peakmut.stk1, peak_wt=peakwt.stk1)
# STK11 - 2
nmut.stk2 <- sum(stk2$Fragment_Status=="MUT")
nwt.stk2 <- sum(stk2$Fragment_Status=="WT")
densmut.stk2 <- density(stk2[stk2$Fragment_Status=="MUT",]$Fragment_size)
denswt.stk2 <- density(stk2[stk2$Fragment_Status=="WT",]$Fragment_size)
peakmut.stk2 <- densmut.stk2$x[which.max(densmut.stk2$y)]
peakwt.stk2 <- denswt.stk2$x[which.max(denswt.stk2$y)]
dres.stk2 <- data.frame(sample="Post-traitement STK11", 
                        n_mut=nmut.stk2, n_wt=nwt.stk2, peak_mut=peakmut.stk2, peak_wt=peakwt.stk2)
In [199]:
# Y220C - 1
nmut.yc1 <- sum(yc1$Fragment_Status=="MUT")
nwt.yc1 <- sum(yc1$Fragment_Status=="WT")
densmut.yc1 <- density(yc1[yc1$Fragment_Status=="MUT",]$Fragment_size)
denswt.yc1 <- density(yc1[yc1$Fragment_Status=="WT",]$Fragment_size)
peakmut.yc1 <- densmut.yc1$x[which.max(densmut.yc1$y)]
peakwt.yc1 <- denswt.yc1$x[which.max(denswt.yc1$y)]
dres.yc1 <- data.frame(sample="Pre-traitement STK11", 
                        n_mut=nmut.yc1, n_wt=nwt.yc1, peak_mut=peakmut.yc1, peak_wt=peakwt.yc1)

# Y220C - 2
nmut.yc2 <- sum(yc2$Fragment_Status=="MUT")
nwt.yc2 <- sum(yc2$Fragment_Status=="WT")
densmut.yc2 <- density(yc2[yc2$Fragment_Status=="MUT",]$Fragment_size)
denswt.yc2 <- density(yc2[yc2$Fragment_Status=="WT",]$Fragment_size)
peakmut.yc2 <- densmut.yc2$x[which.max(densmut.yc2$y)]
peakwt.yc2 <- denswt.yc2$x[which.max(denswt.yc2$y)]
dres.yc2 <- data.frame(sample="Post-traitement STK11", 
                        n_mut=nmut.yc2, n_wt=nwt.yc2, peak_mut=peakmut.yc2, peak_wt=peakwt.yc2)
In [157]:
dres.stk1
dres.stk2
A data.frame: 1 × 5
samplen_mutn_wtpeak_mutpeak_wt
<chr><int><int><dbl><dbl>
Pre-traitement STK115537919135.1001166.1273
A data.frame: 1 × 5
samplen_mutn_wtpeak_mutpeak_wt
<chr><int><int><dbl><dbl>
Post-traitement STK118067157132.855166.1809
In [185]:
ggstk1 <- ggplot(stk1) + 
geom_density(size=3.5, aes(x=Fragment_size, color=Fragment_Status)) +
theme_classic() + gtheme(40) +
xlab("Fragment length (bp)") +
ylab("Density") +
geom_vline(data=dres.stk1,aes(xintercept=peak_mut),col=colmutwt[1], inherit.aes = FALSE, linetype="dashed", size=2.5) +
geom_vline(data=dres.stk1,aes(xintercept=peak_wt),col=colmutwt[2], inherit.aes = FALSE, linetype="dashed", size=2.5) +
#scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) +
scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) +
geom_text(data=dres.stk1, aes(x=247, y=.026, label=paste("N mutant fragments =",n_mut)), size=11, color=colmutwt[1]) +
geom_text(data=dres.stk1, aes(x=247, y=.03, label=paste("N wild-type fragments =",n_wt)), size=11, color=colmutwt[2]) +
#facet_wrap(.~sample2, scales="free_y", ncol=1) + 
noleg + 
#theme(legend.position='left') + 
scale_color_manual(name="Status\nof fragments\nat TP53 Y220", 
                   values=colmutwt, labels=c("Mutant","Wild-type")) + 
theme(legend.box="horizontal")

options(repr.plot.width=12,repr.plot.height=6)
ggstk1
Warning message in geom_vline(data = dres.stk1, aes(xintercept = peak_mut), col = colmutwt[1], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message in geom_vline(data = dres.stk1, aes(xintercept = peak_wt), col = colmutwt[2], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message:
“Removed 2 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image
In [202]:
ggstk2 <- ggplot(stk2) + 
geom_density(size=3.5, aes(x=Fragment_size, color=Fragment_Status)) +
theme_classic() + gtheme(40) +
xlab("Fragment length (bp)") +
ylab("Density") +
geom_vline(data=dres.stk2,aes(xintercept=peak_mut),col=colmutwt[1], inherit.aes = FALSE, linetype="dashed", size=2.5) +
geom_vline(data=dres.stk2,aes(xintercept=peak_wt),col=colmutwt[2], inherit.aes = FALSE, linetype="dashed", size=2.5) +
#scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) +
scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) +
geom_text(data=dres.stk2, aes(x=247, y=.026, label=paste("N mutant fragments =",n_mut)), size=11, color=colmutwt[1]) +
geom_text(data=dres.stk2, aes(x=247, y=.03, label=paste("N wild-type fragments =",n_wt)), size=11, color=colmutwt[2]) +
#facet_wrap(.~sample2, scales="free_y", ncol=1) + 
noleg + 
#theme(legend.position='left') + 
scale_color_manual(name="Status\nof fragments\nat TP53 Y220", 
                   values=colmutwt, labels=c("Mutant","Wild-type")) + 
theme(legend.box="horizontal")

options(repr.plot.width=12,repr.plot.height=6)
ggstk2
Warning message in geom_vline(data = dres.stk2, aes(xintercept = peak_mut), col = colmutwt[1], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message in geom_vline(data = dres.stk2, aes(xintercept = peak_wt), col = colmutwt[2], :
“Ignoring unknown parameters: `inherit.aes`”
No description has been provided for this image
In [ ]:

In [207]:
ggyc2 <- ggplot(yc2) + 
geom_density(size=3.5, aes(x=Fragment_size, color=Fragment_Status)) +
theme_classic() + gtheme(40) +
xlab("Fragment length (bp)") +
ylab("Density") +
geom_vline(data=dres.yc2,aes(xintercept=peak_mut),col=colmutwt[1], inherit.aes = FALSE, linetype="dashed", size=2.5) +
geom_vline(data=dres.yc2,aes(xintercept=peak_wt),col=colmutwt[2], inherit.aes = FALSE, linetype="dashed", size=2.5) +
#scale_x_continuous(limits=c(70,310), breaks=c(100, 145, 150, 167, 200, 250, 300), guide = guide_axis(n.dodge = 2)) +
scale_x_continuous(limits=c(70,310), breaks=c(100, 150, 200, 250, 300), guide = guide_axis(n.dodge = 1)) +
geom_text(data=dres.yc2, aes(x=247, y=.026, label=paste("N mutant fragments =",n_mut)), size=11, color=colmutwt[1]) +
geom_text(data=dres.yc2, aes(x=247, y=.03, label=paste("N wild-type fragments =",n_wt)), size=11, color=colmutwt[2]) +
#facet_wrap(.~sample2, scales="free_y", ncol=1) + 
noleg + 
#theme(legend.position='left') + 
scale_color_manual(name="Status\nof fragments\nat TP53 Y220", 
                   values=colmutwt, labels=c("Mutant","Wild-type")) + 
theme(legend.box="horizontal")

options(repr.plot.width=12,repr.plot.height=6)
ggyc2
Warning message in geom_vline(data = dres.yc2, aes(xintercept = peak_mut), col = colmutwt[1], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message in geom_vline(data = dres.yc2, aes(xintercept = peak_wt), col = colmutwt[2], :
“Ignoring unknown parameters: `inherit.aes`”
Warning message:
“Removed 3 rows containing non-finite outside the scale range (`stat_density()`).”
No description has been provided for this image
In [ ]:

In [ ]:

In [ ]:

Misc¶

In [581]:
killian <- function(df, ...) {
    ggplot(df, aes(x=x,y=y)) + geom_point(...)
}
In [583]:
caca <- data.frame(x=1:10, y=1:10)

killian(caca, size=5, color="red")
No description has been provided for this image
In [584]:
gtheme
function (size = 18) 
{
    theme(text = element_text(size = size, colour = "black"), 
        axis.text = element_text(size = size, colour = "black"), 
        axis.title = element_text(size = size, colour = "black"), 
        strip.text.x = element_text(size = size), legend.text = element_text(size = size))
}
In [208]:
sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20.0.0
Running under: macOS Sonoma 14.5

Matrix products: default
BLAS/LAPACK: /opt/homebrew/Caskroom/miniforge/base/envs/work/lib/libopenblas.0.dylib;  LAPACK version 3.12.0

locale:
[1] C/UTF-8/C/C/C/C

time zone: Europe/Paris
tzcode source: system (macOS)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggtext_0.1.2    ggpubr_0.6.0    ggsci_3.2.0     patchwork_1.3.0
 [5] MetBrewer_0.2.0 readxl_1.4.3    lubridate_1.9.4 forcats_1.0.0  
 [9] stringr_1.5.1   dplyr_1.1.4     purrr_1.0.2     readr_2.1.5    
[13] tidyr_1.3.1     tibble_3.2.1    ggplot2_3.5.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] gtable_0.3.6       xfun_0.52          rstatix_0.7.2      lattice_0.22-6    
 [5] tzdb_0.4.0         vctrs_0.6.5        tools_4.4.1        generics_0.1.3    
 [9] pkgconfig_2.0.3    Matrix_1.7-2       RColorBrewer_1.1-3 uuid_1.2-1        
[13] lifecycle_1.0.4    compiler_4.4.1     farver_2.1.2       textshaping_1.0.0 
[17] repr_1.1.7         carData_3.0-5      htmltools_0.5.8.1  Formula_1.2-5     
[21] pillar_1.10.2      car_3.1-3          crayon_1.5.3       abind_1.4-8       
[25] nlme_3.1-166       commonmark_1.9.2   tidyselect_1.2.1   digest_0.6.37     
[29] stringi_1.8.7      labeling_0.4.3     splines_4.4.1      fastmap_1.2.0     
[33] grid_4.4.1         cli_3.6.5          magrittr_2.0.3     base64enc_0.1-3   
[37] IRdisplay_1.1      broom_1.0.7        withr_3.0.2        scales_1.4.0      
[41] backports_1.5.0    IRkernel_1.3.2     timechange_0.3.0   pbdZMQ_0.3-13     
[45] ggsignif_0.6.4     cellranger_1.1.0   ragg_1.4.0         hms_1.1.3         
[49] evaluate_1.0.3     markdown_1.13      mgcv_1.9-1         rlang_1.1.6       
[53] gridtext_0.1.5     Rcpp_1.0.14        glue_1.8.0         xml2_1.3.6        
[57] jsonlite_2.0.0     R6_2.6.1           systemfonts_1.2.1 
In [ ]: